import numpy as np
import pandas as pd
# Load the news records from the tab-separated file 'data.tsv', resolved
# relative to the current working directory.
data = pd.read_csv(
    'data.tsv',
    sep='\t',
)
import re
# Split 'news_posttime' once on a single space with the vectorized string
# accessor (instead of compiling a regex inside a per-row lambda, twice).
# The first piece is stored as 'date'; the first two characters of the
# second piece are stored as 'hour' (both still strings at this point).
# NOTE(review): presumably values look like "<date> <HH:MM...>" — confirm
# against the data file. Rows that are missing or contain no space yield NaN
# here rather than raising inside a lambda.
_posttime_parts = data['news_posttime'].str.split(' ')
data['date'] = _posttime_parts.str[0]
data['hour'] = _posttime_parts.str[1].str[:2]
# Compute the string length of each news digest.
data['len_news_digest'] = data['news_digest'].str.len()
# Select data and build features: drop the raw timestamp, URL, digest text
# and tag columns. `columns=` already targets the column axis, so the
# redundant `axis=1` argument is not passed.
data1 = data.drop(columns=['news_posttime', 'news_url', 'news_digest', 'tag'])
# Convert the grouping keys from strings to an integer hour and a datetime.
data1['hour'] = data1['hour'].astype('int64')
data1['date'] = pd.to_datetime(data1['date'])
# Total digest length per (date, hour) pair. The aggregation is named by the
# string 'sum' rather than the builtin `sum`: pandas >= 2.1 warns that builtin
# callables passed to agg will stop being mapped to the groupby sum, and the
# string keeps the current behaviour.
data2 = data1.groupby(by=['date', 'hour']).agg({'len_news_digest': 'sum'})
# Lift the row display limit so the whole result is printed untruncated.
pd.set_option('display.max_rows', None)
print(data2)